# Read metaviz_long.rds from the "data-processed" folder; the path is built
# with here::here() and the file is loaded through rio::import().
metaviz_long <- rio::import(
  here::here("data-processed", "metaviz_long.rds")
)

Explore Data for My-Project

All variables

By Year

First, you can see the number of available observations for each variable in each year:

  • x-axis = survey year
  • y-axis = variables
  • size = number of observations
  • color = variable group

empty plot

# Step 1: set up the plot canvas only (no geom yet). Variable labels go on x,
# survey year on y, and the variable category is mapped to colour.
metaviz_long %>%
  drop_na(value) %>%
  mutate(
    key_name_label = factor(key_name_label),
    # numeric version of the category, used below as the sort key
    # (presumably the factor level codes of key_category -- TODO confirm)
    order = as.numeric(key_category),
    key_name_label = fct_reorder(key_name_label, desc(order))
  ) %>%
  ggplot(aes(x = key_name_label, y = syear, colour = key_category))
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## mutate: converted 'key_name_label' from character to factor (0 new NA)
##         new variable 'order' (double) with 5 unique values and 0% NA

with data points

# Step 2: same canvas as before, now with geom_count() so that each
# label/year combination is drawn as a point sized by its number of rows.
metaviz_long %>%
  drop_na(value) %>%
  mutate(
    key_name_label = factor(key_name_label),
    # numeric version of the category, used below as the sort key
    order = as.numeric(key_category),
    key_name_label = fct_reorder(key_name_label, desc(order))
  ) %>%
  ggplot(aes(x = key_name_label, y = syear, colour = key_category)) +
  geom_count()
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## mutate: converted 'key_name_label' from character to factor (0 new NA)
##         new variable 'order' (double) with 5 unique values and 0% NA

flip the axis

# Step 3: flip the coordinates so the variable labels run down the vertical
# axis and the survey years along the horizontal axis.
metaviz_long %>%
  drop_na(value) %>%
  mutate(
    key_name_label = factor(key_name_label),
    # numeric version of the category, used below as the sort key
    order = as.numeric(key_category),
    key_name_label = fct_reorder(key_name_label, desc(order))
  ) %>%
  ggplot(aes(x = key_name_label, y = syear, colour = key_category)) +
  geom_count() +
  coord_flip()
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## mutate: converted 'key_name_label' from character to factor (0 new NA)
##         new variable 'order' (double) with 5 unique values and 0% NA

final plot

# Final version of the by-year overview: counted points per variable label and
# survey year, coloured by variable category, with wrapped axis labels, a
# single-column legend and a fixed 1998-2018 year axis.
metaviz_long %>%
  drop_na(value) %>%
  mutate(
    key_name_label = factor(key_name_label),
    # numeric version of the category, used below as the sort key
    order = as.numeric(key_category),
    key_name_label = fct_reorder(key_name_label, desc(order))
  ) %>%
  ggplot(aes(x = key_name_label, y = syear, colour = key_category)) +
  geom_count() +
  coord_flip() +
  theme(
    legend.position = "right",
    # align the title with the whole plot instead of the panel area
    plot.title.position = "plot"
  ) +
  guides(colour = guide_legend(ncol = 1)) +
  # label_wrap() is the current scales name for the superseded wrap_format();
  # namespace-qualified like the other scales call in this document
  scale_x_discrete(labels = scales::label_wrap(40)) +
  scale_y_continuous(
    limits = c(1998, 2018),
    breaks = seq(1998, 2018, by = 2)
  ) +
  labs(
    title = "Number of observations for selected SOEP variables from 1998 - 2018",
    subtitle = "Size indicates number of observations",
    y = "",
    x = ""
  )
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## mutate: converted 'key_name_label' from character to factor (0 new NA)
##         new variable 'order' (double) with 5 unique values and 0% NA

Overall

Here is an overall plot of the number of available observations for each of the variables. It helps to get a general understanding of the proportion of missing values for groups of variables.

# Overall number of non-missing observations per variable, drawn as horizontal
# bars with a count label, one facet per variable category. Rows of the
# "Psych. Measure" category are filtered out before counting.
metaviz_long %>%
  drop_na(value) %>%
  filter(key_category != "Psych. Measure") %>%
  group_by(key) %>%
  add_count() %>%
  ungroup() %>%
  # keep one row per variable; `n` already carries that variable's row count
  # (TRUE spelled out: `T` is an ordinary variable that can be reassigned)
  distinct(key, .keep_all = TRUE) %>%
  group_by(key_category) %>%
  # order the labels by their count within each category
  mutate(key_name_label = fct_reorder(factor(key_name_label), n)) %>%
  ggplot(aes(x = key_name_label, y = n, fill = key_category, label = n)) +
  geom_col(width = 0.2) +
  geom_point() +
  geom_label(color = "white", size = 2) +
  coord_flip() +
  scale_y_continuous(labels = scales::label_number_auto()) +
  # label_wrap() is the current scales name for the superseded wrap_format()
  scale_x_discrete(labels = scales::label_wrap(40)) +
  theme_light() +
  theme(legend.position = "none") +
  facet_wrap(~key_category, ncol = 1, scales = "free") +
  labs(
    title = "Overall Number of observations for selected SOEP variables from 1998 - 2018",
    y = "",
    x = ""
  )
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## filter: removed 66,568 rows (14%), 409,994 rows remaining
## group_by: one grouping variable (key)
## add_count (grouped): new variable 'n' (integer) with 30 unique values and 0% NA
## ungroup: no grouping variables
## distinct: removed 409,964 rows (>99%), 30 rows remaining
## group_by: one grouping variable (key_category)
## mutate (grouped): converted 'key_name_label' from character to factor (0 new NA)

## By Variable Category {.tabset}

By Year

ID’s

# Observations per survey year for the variables in the "ID's" category;
# geom_count() sizes each point by the number of rows at that position.
metaviz_long %>%
  drop_na(value) %>%
  filter(key_category == "ID's") %>%
  ggplot(aes(x = key_name_label, y = syear)) +
  geom_count(colour = "#440154FF") +
  coord_flip() +
  theme(legend.position = "right") +
  # label_wrap() is the current scales name for the superseded wrap_format()
  scale_x_discrete(labels = scales::label_wrap(40)) +
  scale_y_continuous(
    limits = c(1998, 2018),
    breaks = seq(1998, 2018, by = 2)
  ) +
  labs(
    title = "Number of observations for selected SOEP variables from 1998 - 2018",
    subtitle = "Size indicates number of observations",
    y = "",
    x = ""
  )
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## filter: removed 423,393 rows (89%), 53,169 rows remaining

Survey

# Observations per survey year for the variables in the "Survey" category;
# geom_count() sizes each point by the number of rows at that position.
metaviz_long %>%
  drop_na(value) %>%
  filter(key_category == "Survey") %>%
  ggplot(aes(x = key_name_label, y = syear)) +
  geom_count(colour = "#3B528BFF") +
  coord_flip() +
  theme(legend.position = "right") +
  # label_wrap() is the current scales name for the superseded wrap_format()
  scale_x_discrete(labels = scales::label_wrap(40)) +
  scale_y_continuous(
    limits = c(1998, 2018),
    breaks = seq(1998, 2018, by = 2)
  ) +
  labs(
    title = "Number of observations for selected SOEP variables from 1998 - 2018",
    subtitle = "Size indicates number of observations",
    y = "",
    x = ""
  )
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## filter: removed 381,757 rows (80%), 94,805 rows remaining

Demography

# Observations per survey year for the variables in the "Demography" category;
# geom_count() sizes each point by the number of rows at that position.
metaviz_long %>%
  drop_na(value) %>%
  filter(key_category == "Demography") %>%
  ggplot(aes(x = key_name_label, y = syear)) +
  geom_count(colour = "#21908CFF") +
  coord_flip() +
  theme(legend.position = "right") +
  # label_wrap() is the current scales name for the superseded wrap_format()
  scale_x_discrete(labels = scales::label_wrap(40)) +
  scale_y_continuous(
    limits = c(1998, 2018),
    breaks = seq(1998, 2018, by = 2)
  ) +
  labs(
    title = "Number of observations for selected SOEP variables from 1998 - 2018",
    subtitle = "Size indicates number of observations",
    y = "",
    x = ""
  )
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## filter: removed 280,369 rows (59%), 196,193 rows remaining

Psychol. Measures

# Observations per survey year for the variables in the "Psych. Measure"
# category; geom_count() sizes each point by the number of rows at that
# position.
metaviz_long %>%
  drop_na(value) %>%
  filter(key_category == "Psych. Measure") %>%
  ggplot(aes(x = key_name_label, y = syear)) +
  geom_count(colour = "#5DC863FF") +
  coord_flip() +
  theme(legend.position = "right") +
  # label_wrap() is the current scales name for the superseded wrap_format()
  scale_x_discrete(labels = scales::label_wrap(40)) +
  scale_y_continuous(
    limits = c(1998, 2018),
    breaks = seq(1998, 2018, by = 2)
  ) +
  labs(
    title = "Number of observations for selected SOEP variables from 1998 - 2018",
    subtitle = "Size indicates number of observations",
    y = "",
    x = ""
  )
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## filter: removed 409,994 rows (86%), 66,568 rows remaining

Other

# Observations per survey year for the variables in the "Other" category;
# geom_count() sizes each point by the number of rows at that position.
metaviz_long %>%
  drop_na(value) %>%
  filter(key_category == "Other") %>%
  ggplot(aes(x = key_name_label, y = syear)) +
  geom_count(colour = "#FDE725FF") +
  coord_flip() +
  theme(legend.position = "right") +
  # label_wrap() is the current scales name for the superseded wrap_format()
  scale_x_discrete(labels = scales::label_wrap(40)) +
  scale_y_continuous(
    limits = c(1998, 2018),
    breaks = seq(1998, 2018, by = 2)
  ) +
  labs(
    title = "Number of observations for selected SOEP variables from 1998 - 2018",
    subtitle = "Size indicates number of observations",
    y = "",
    x = ""
  )
## drop_na: removed 5,920,326 rows (93%), 476,562 rows remaining
## filter: removed 410,735 rows (86%), 65,827 rows remaining

Supplement